1. Understanding our data and quick review
2. Preprocessing
3. Building first neural network
4. Checking results
5. Plots for our first neural network
6. Building second neural network
7. Checking results
8. Plots for second NN
9. Last NN using StratifiedKfold
10. Results
11. Summarizing results and choosing best neural network
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas_profiling
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.impute import KNNImputer
from pandas_profiling import ProfileReport
from keras import backend as K
from keras.layers import Dropout
from sklearn.metrics import roc_curve,auc
# Load the SNP call data; 'genotype' is the binary target (read as float32).
data = pd.read_csv('data/logreg.txt', sep=';', dtype={'genotype':'float32'})
data.head()
data.shape
good_prediction = data[data['genotype'] == 1].shape[0]
bad_prediction = data[data['genotype'] == 0].shape[0]
print(str(round(good_prediction * 100 / (good_prediction + bad_prediction), 2)) + '% good SNP')
print(str(round(bad_prediction * 100 / (good_prediction + bad_prediction), 2)) + '% bad SNP')
# Swap the labels (0 <-> 1) so the minority 'bad SNP' class becomes label 1.
# A single .replace() does the swap in one pass; the original three chained
# assignments (data['genotype'][mask] = ...) via a temporary value 2.0 raise
# SettingWithCopyWarning and can silently stop working under pandas
# copy-on-write.
data['genotype'] = data['genotype'].replace({0.0: 1.0, 1.0: 0.0})
# Concatenate the three flanking-base columns on each side into one feature.
data['BEFORE'] = data['BEFORE1'] + data['BEFORE2'] + data['BEFORE3']
data['BEHIND'] = data['BEHIND1'] + data['BEHIND2'] + data['BEHIND3']
data = data.drop(['BEFORE1','BEFORE2','BEFORE3','BEHIND1','BEHIND2','BEHIND3'], axis=1)
data.profile_report()
# Mean-impute DP2, then drop any other columns that still contain NaNs.
data['DP2'] = data['DP2'].fillna(data['DP2'].mean())
data = data.dropna(axis=1)
# One-hot encode the combined flanking-sequence columns.
data = pd.get_dummies(data, prefix=['BEFORE', 'BEHIND'])
data.shape
data.head()
# First column is the target, the rest are features.
data_x = data.iloc[:, 1:]
data_y = data.iloc[:, :1]
# Stratified 90/10 split keeps the class ratio identical in train and test.
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, stratify=data_y, test_size=0.1, random_state=10)
y_train.genotype.value_counts()[0] / (y_train.genotype.value_counts()[1] + y_train.genotype.value_counts()[0])
y_test.genotype.value_counts()[0] / (y_test.genotype.value_counts()[1] + y_test.genotype.value_counts()[0])
def standardize(column):
    """Z-score *column* in the global x_train / x_test frames in place.

    The mean and standard deviation are taken from x_train only, so no
    information from the test set leaks into the scaling.
    """
    mu = x_train[column].mean()
    sigma = x_train[column].std()
    for frame in (x_train, x_test):
        frame.loc[:, column] = (frame[column] - mu) / sigma
# Standardize the continuous features using training-set statistics.
standardize('QUAL')
standardize('DP')
standardize('DP2')
standardize('CALL')
# Carve a stratified 10% validation set out of the training data.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify=y_train, test_size=0.1, random_state=11)
# Number of input features, used for the network's first layer.
input_shape = x_train.shape[1]
x_train.shape
def recall_m(y_true, y_pred):
    """Batch-wise recall = TP / (TP + FN), on predictions rounded to {0, 1}."""
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # K.epsilon() guards against division by zero when a batch has no positives.
    return tp / (actual_positives + K.epsilon())
def precision_m(y_true, y_pred):
    """Batch-wise precision = TP / (TP + FP), on predictions rounded to {0, 1}."""
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # K.epsilon() guards against division by zero when nothing is predicted positive.
    return tp / (predicted_positives + K.epsilon())
def f1_m(y_true, y_pred):
    """Batch-wise F1 score: harmonic mean of precision_m and recall_m."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * p * r / (p + r + K.epsilon())
# First NN: a plain funnel MLP (512 -> 256 -> ... -> 8 -> 1) with a sigmoid
# output for binary classification.
model = Sequential()
# Use the feature count computed from the data (input_shape, set after the
# train/val split) instead of the magic constant 134, so the model stays in
# sync with any preprocessing changes.
model.add(Dense(512, input_shape=(input_shape,), activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Track F1 rather than accuracy because the classes are imbalanced.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[f1_m])
model.summary()
callback_list = [
    # Stop when the *validation* F1 stops improving. The original monitored
    # the training metric 'f1_m', which keeps rising while the model
    # overfits, so early stopping would rarely trigger for the right reason.
    EarlyStopping(
        monitor='val_f1_m',
        patience=5,
        mode='max'
    ),
    # Persist only the weights with the best validation F1.
    ModelCheckpoint(
        filepath='my_model.h5',
        monitor='val_f1_m',
        save_best_only=True,
        mode='max'
    )
]
# Train with a large batch (512) for up to 200 epochs; early stopping and
# checkpointing are handled by the callbacks defined above.
history = model.fit(x_train, y_train,
                    batch_size=512,
                    epochs=200,
                    callbacks=callback_list,
                    validation_data=(x_val, y_val))
# Evaluate on the held-out test set: results = [loss, f1_m].
results = model.evaluate(x_test, y_test)
print('loss: ', results[0])
print('F1-score: ', results[1])
y_pred = model.predict(x_test)
# NOTE(review): predict_classes is deprecated/removed in modern Keras;
# (model.predict(x_test) > 0.5).astype('int32') is the replacement.
y_pred_class = model.predict_classes(x_test)
cm = confusion_matrix(y_test, y_pred_class)
cr = classification_report(y_test, y_pred_class)
ax = plt.subplot()
sn.heatmap(cm, annot=True, fmt='g', cmap="Blues", ax=ax)
# sklearn's confusion_matrix puts TRUE labels on rows and PREDICTED labels
# on columns; the original axis labels were swapped.
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1'])
print(cr)
# Metrics below treat class 0 (good SNP) as the positive class.
print('Precision:', str(round(cm[0][0] * 100 / (cm[0][0] + cm[1][0]), 2)) + '%')
print('Sensitivity:', str(round(cm[0][0] * 100 / (cm[0][0] + cm[0][1]), 2)) + '%')
# Specificity is the recall of the negative class: TN / (TN + FP), i.e. the
# class-1 ROW of the matrix. The original divided by cm[0][1] (the class-1
# column), which computes class-1 precision, not specificity.
print('Specificity:', str(round((cm[1][1] * 100 / (cm[1][1] + cm[1][0])), 2)) + '%')
The recall metric shows how many of the relevant samples are selected, i.e. how well our model finds all the samples of interest in the dataset.
The precision metric tells us how many of the predicted samples are relevant, i.e. how often we mistakenly classify a sample as correct when it is not.
Probability that a good SNP (0) will be classified correctly (as 0).
# Balanced accuracy = mean of per-class recalls; robust to class imbalance.
balanced_accuracy_score(y_test, y_pred_class)
$ MCC = \frac{tp \times tn - fp \times fn}{\sqrt{(tp + fp)(tp + fn)(tn + fp)(tn + fn)}}. $
# Matthews correlation coefficient for the first model (see formula above).
matthews_corrcoef(y_test, y_pred_class)
# Training curves for the first NN. The second series in each plot comes
# from validation_data=(x_val, y_val), so it is labelled 'validation'
# (the original legend said 'test', but x_test is never seen during fit()).
plt.plot(history.history['f1_m'])
plt.plot(history.history['val_f1_m'])
plt.title('model F1 score')
plt.ylabel('F1')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
lw = 2
# The ROC curve needs the raw sigmoid probabilities (y_pred), not the
# thresholded class labels.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

def plot_roc_curve(fpr, tpr, roc_auc=None, lw=2):
    """Plot one ROC curve with its AUC in the legend.

    The AUC and line width are now explicit parameters (the original read
    the globals `roc_auc` and `lw`, which made the function fragile). When
    roc_auc is not supplied it is computed from fpr/tpr, which yields the
    same value the global held at every existing call site.
    """
    if roc_auc is None:
        roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

plot_roc_curve(fpr, tpr)
# Second NN: same funnel as the first, plus Dropout(0.2) after the wider
# layers to reduce overfitting, and class weights to counter the imbalance.
model = Sequential()
# Feature count from the data (input_shape), not the magic constant 134.
model.add(Dense(512, input_shape=(input_shape,), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[f1_m])
# Errors on the minority class (label 1) are penalized five times as much.
weights = {0:1, 1:5}
history = model.fit(x_train, y_train,
                    class_weight=weights,
                    batch_size=512,
                    epochs=100,
                    validation_data=(x_val, y_val))
# Evaluate the second NN on the held-out test set: results = [loss, f1_m].
results = model.evaluate(x_test, y_test)
print('loss: ', results[0])
print('F1-score: ', results[1])
y_pred = model.predict(x_test)
# NOTE(review): predict_classes is deprecated/removed in modern Keras.
y_pred_class = model.predict_classes(x_test)
cm = confusion_matrix(y_test, y_pred_class)
cr = classification_report(y_test, y_pred_class)
ax = plt.subplot()
sn.heatmap(cm, annot=True, fmt='g', cmap="Blues", ax=ax)
# Rows of cm are true labels, columns are predictions (sklearn convention);
# the original axis labels were swapped.
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1'])
print(cr)
print('Precision:', str(round(cm[0][0] * 100 / (cm[0][0] + cm[1][0]), 2)) + '%')
print('Sensitivity(Recall):', str(round(cm[0][0] * 100 / (cm[0][0] + cm[0][1]), 2)) + '%')
# Specificity = TN / (TN + FP) = the class-1 ROW; the original divided by
# the class-1 column (cm[0][1]), which is class-1 precision.
print('Specificity:', str(round((cm[1][1] * 100 / (cm[1][1] + cm[1][0])), 2)) + '%')
balanced_accuracy_score(y_test, y_pred_class)
matthews_corrcoef(y_test, y_pred_class)
# Training curves for the second NN. The second series in each plot is the
# validation split, so it is labelled 'validation' (the original legend
# said 'test', but x_test is never used during fit()).
plt.plot(history.history['f1_m'])
plt.plot(history.history['val_f1_m'])
plt.title('model F1 score')
plt.ylabel('F1')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
lw = 2
# ROC for the second model, from the raw probabilities (y_pred); reuses
# plot_roc_curve defined for the first model.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plot_roc_curve (fpr,tpr)
# Third experiment: stratified 6-fold cross-validation. Each fold trains a
# fresh network and contributes one ROC curve; TPRs are averaged on a
# common FPR grid for the mean ROC.
n_splits = 6  # single source of truth (was a hard-coded 6 in two places)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=3)
models_results = []
mean_tpr = 0.0  # becomes an ndarray after the first fold's += np.interp
mean_fpr = np.linspace(0, 1, 100)  # shared FPR grid for averaging TPRs
all_tpr = []
i = 1
for train_index, test_index in skf.split(data_x, data_y):
    # Rebind the module-level x_train/x_test so standardize() scales this fold.
    x_train, x_test = data_x.iloc[train_index], data_x.iloc[test_index]
    y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]
    # Scale with training-fold statistics only (no leakage from the fold's test part).
    standardize('QUAL')
    standardize('DP')
    standardize('CALL')
    standardize('DP2')
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify=y_train, test_size=0.1, random_state=11)
    model = Sequential()
    # Input size derived from the data instead of the magic constant 134.
    model.add(Dense(512, input_shape=(x_train.shape[1],), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[f1_m])
    # Heavier minority-class penalty than the second NN's {0:1, 1:5}.
    weights = {0:1, 1:7.5}
    history = model.fit(x_train, y_train,
                        class_weight=weights,
                        batch_size=512,
                        epochs=25,
                        validation_data=(x_val, y_val))
    loss_and_f1 = model.evaluate(x_test, y_test)
    y_pred = model.predict(x_test)
    y_pred_class = model.predict_classes(x_test)
    cm = confusion_matrix(y_test, y_pred_class)
    cr = classification_report(y_test, y_pred_class)
    # (loss, f1, confusion matrix, report, predicted classes, true labels)
    results = (loss_and_f1[0], loss_and_f1[1], cm, cr, y_pred_class, y_test)
    models_results.append(results)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    # Resample this fold's TPR onto the shared FPR grid and accumulate.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    i += 1
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
# Average the accumulated TPRs over all folds (was hard-coded /= 6).
mean_tpr /= n_splits
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) with cross validation ')
plt.legend(loc="lower right")
plt.show()
# Print the stored metrics for each CV fold and accumulate running sums of
# balanced accuracy (sr_acc) and MCC (sr_mcc) for the averages shown later.
sr_acc = 0
sr_mcc = 0
print('Results for every fold: ' + '\n\n')
for fold_idx, fold in enumerate(models_results):
    # Unpack the per-fold tuple built inside the CV loop.
    loss, f1, cm_fold = fold[0], fold[1], fold[2]
    report, preds, truth = fold[3], fold[4], fold[5]
    print('-- Fold', fold_idx + 1, '--\n')
    print('F1 score:', str(round(f1, 4) * 100) + '%')
    print('Loss:', round(loss, 4))
    print('\nConfusion matrix:\n', cm_fold)
    print('\nPrecision:', str(round(cm_fold[0][0] * 100 / (cm_fold[0][0] + cm_fold[1][0]), 2)) + '%')
    print('Sensitivity(Recall):', str(round(cm_fold[0][0] * 100 / (cm_fold[0][0] + cm_fold[0][1]), 2)) + '%')
    ba_acc = balanced_accuracy_score(truth, preds)
    print('\nBalanced Accuracy Score:(=AUC)', ba_acc)
    sr_acc += ba_acc
    mcc = matthews_corrcoef(truth, preds)
    print('\nMCC:', mcc)
    sr_mcc += mcc
    print('\n', report)
    print('\n\n')
# Aggregate the per-fold results into average metrics. Derive the fold
# count from the data instead of hard-coding 6 and indexing each fold.
n_folds = len(models_results)
# Element-wise sum of the fold confusion matrices, then floor-averaged.
sm = sum(r[2] for r in models_results)
av_cm = sm // n_folds
ax = plt.subplot()
sn.heatmap(av_cm, annot=True, fmt='g', cmap="Blues", ax=ax)
# Rows = true labels, columns = predictions (sklearn convention); the
# original axis labels were swapped.
ax.set_xlabel('Predicted labels'); ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1'])
print('Average results:')  # fixed typo 'Averge'
average = str(round(sum(r[1] for r in models_results) / n_folds, 4) * 100) + '%'
print('\n' + 'F1 score:', average)
print('Precision:', str(round(av_cm[0][0] * 100 / (av_cm[0][0] + av_cm[1][0]), 2)) + '%')
print('Sensitivity(Recall):', str(round(av_cm[0][0] * 100 / (av_cm[0][0] + av_cm[0][1]), 2)) + '%')
print('Mean balanced accuracy score(=AUC):', str(round(sr_acc / n_folds, 4)))
print('Mean MCC:', str(round(sr_mcc / n_folds, 4)))
| | Recall | Precision | balanced accuracy score | MCC | F1-SCORE |
|---|---|---|---|---|---|
| First NN | 98.66% | 97.89% | 0.68 | 0.40 | 23% |
| Second NN | 99.08% | 97.95% | 0.69 | 0.46 | 25% |
| Last NN (average results) | 97.52% | 98.21% | 0.7235 | 0.41 | 13.93% |